Homework 3: Some of my best friends are Zombies :0

Deadites are in the same genus no? Mommie’s with the maggots now!
Deadites are in the same genus no? Mommie’s with the maggots now!

1 Questions

1.1 Libraries

library(tidyverse)
library(rmdformats)
library(curl)

1.2 Question 1

First we must load in the data!

# Read the survivor data set from the course repository, then preview the
# first rows. (Author's note: the gender variable here includes nonbinary
# survivors, which is rare representation for a dataset like this.)
zombies <- read_csv(
  curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/zombies.csv")
)
head(zombies)
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): first_name, last_name, gender, major
## dbl (6): id, height, weight, zombies_killed, years_of_education, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 10
##      id first_name last_name gender height weight zombies_killed
##   <dbl> <chr>      <chr>     <chr>   <dbl>  <dbl>          <dbl>
## 1     1 Sarah      Little    Female   62.9   132.              2
## 2     2 Mark       Duncan    Male     67.8   146.              5
## 3     3 Brandon    Perez     Male     72.1   153.              1
## 4     4 Roger      Coleman   Male     66.8   130.              5
## 5     5 Tammy      Powell    Female   64.7   132.              4
## 6     6 Anthony    Green     Male     71.2   153.              1
## # ℹ 3 more variables: years_of_education <dbl>, major <chr>, age <dbl>
# Population mean of each quantitative variable: store it, then echo it.
pop_height_mean <- mean(zombies$height)
pop_height_mean
## [1] 67.6301
pop_weight_mean <- mean(zombies$weight)
pop_weight_mean
## [1] 143.9075
pop_zkills_mean <- mean(zombies$zombies_killed)
pop_zkills_mean
## [1] 2.992
pop_yrsedu_mean <- mean(zombies$years_of_education)
pop_yrsedu_mean
## [1] 2.996
pop_age_mean <- mean(zombies$age)
pop_age_mean
## [1] 20.04696
#' Population variance of a numeric vector
#'
#' `var()` returns the sample variance (divisor n - 1). Multiplying by
#' (n - 1) / n rescales it to the population variance (divisor n).
#'
#' @param data A numeric vector.
#' @param na.rm If `TRUE`, missing values are dropped before both the variance
#'   and the count `n` are computed. Defaults to `FALSE`, in which case any
#'   `NA` in `data` yields `NA`, exactly as before this argument existed.
#' @return A single number: the population variance of `data`.
pop_var <- function(data, na.rm = FALSE) {
  if (na.rm) {
    # Drop NAs up front so that n counts only the values var() actually uses.
    data <- data[!is.na(data)]
  }
  n <- length(data)
  var(data) * ((n - 1) / n)
}
# Population variance of each quantitative variable via pop_var():
# store it, then echo it.
pop_height_var <- pop_var(zombies$height)
pop_height_var
## [1] 18.55861
pop_weight_var <- pop_var(zombies$weight)
pop_weight_var
## [1] 338.2604
pop_zkills_var <- pop_var(zombies$zombies_killed)
pop_zkills_var
## [1] 3.053936
pop_yrsedu_var <- pop_var(zombies$years_of_education)
pop_yrsedu_var
## [1] 2.807984
pop_age_var <- pop_var(zombies$age)
pop_age_var
## [1] 8.782822

1.3 Question 2

# One boxplot figure per quantitative variable, split by gender: mapping
# `color = gender` gives every gender level its own box.
# gender/sex is more real thank you very much (but also I hate the term)
# (The zombies_killed plot used to be drawn twice; the duplicate is removed.)

zombies %>% ggplot(aes(x = height, color = gender)) +
  geom_boxplot() +
  ggtitle("Height by gender/sex")

zombies %>% ggplot(aes(x = weight, color = gender)) +
  geom_boxplot() +
  ggtitle("Weight by gender/sex")

zombies %>% ggplot(aes(x = zombies_killed, color = gender)) +
  geom_boxplot() +
  ggtitle("Zombies killed by gender/sex")

zombies %>% ggplot(aes(x = years_of_education, color = gender)) +
  geom_boxplot() +
  ggtitle("Years of education by gender/sex")

zombies %>% ggplot(aes(x = age, color = gender)) +
  geom_boxplot() +
  ggtitle("Age by gender/sex")

1.4 Question 3

# Weight against age, colored by gender, with a linear fit per gender.
# (An earlier attempt that used size as well as color looked bad.)
ggplot(zombies, aes(x = age, y = weight, color = gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Age and weight")
## `geom_smooth()` using formula = 'y ~ x'

# Height against age, colored by gender, with a linear fit per gender.
ggplot(zombies, aes(x = age, y = height, color = gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Age and height")
## `geom_smooth()` using formula = 'y ~ x'

# Height against weight, colored by gender, with a linear fit per gender.
# The title now names the variables actually plotted (it previously repeated
# "Age and height" from the plot before it).
zombies %>% ggplot(aes(x = weight, y = height, color = gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Weight and height")
## `geom_smooth()` using formula = 'y ~ x'

There seems to be a linear relationship between height and age, and a fairly weak relationship between weight and age. The strongest linear relationship is between height and weight.

1.5 Question 4

We’re going to look at normality, variable by variable!

1.5.1 Height

# Base-graphics normality check for height (quick plots, not pretty ones):
# a histogram for the overall shape, then a normal Q-Q plot, where points
# falling close to a straight line indicate approximate normality.
hist(zombies$height)

qqnorm(zombies$height)

# it looks normal!

1.5.2 Weight

# Normality check for weight: histogram for shape, then a normal Q-Q plot.
hist(zombies$weight)

qqnorm(zombies$weight)

# it looks approximately normal

1.5.3 Zombies Killed

# Normality check for zombies killed: histogram, then a normal Q-Q plot.
hist(zombies$zombies_killed)

qqnorm(zombies$zombies_killed)

# this is not normal
# i think this is poisson
# i'm breaking out kolmogorov smirnov
# One-sample KS test against a Poisson CDF whose rate is the population mean.
# NOTE(review): the null hypothesis is that the data DO follow the given
# distribution, so a small p-value is evidence against Poisson, not for it.
# The KS test also assumes a continuous distribution, which is why count data
# triggers the ties warning.
ks.test(zombies$zombies_killed, "ppois", pop_zkills_mean)
## Warning in ks.test.default(zombies$zombies_killed, "ppois", pop_zkills_mean):
## ties should not be present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  zombies$zombies_killed
## D = 0.22602, p-value < 2.2e-16
## alternative hypothesis: two-sided

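The histogram shape and the nearly equal mean and variance (2.992 vs. 3.054) suggest this is Poisson distributed! Note, though, that the KS test above does not confirm it: the test assumes a continuous distribution (hence the ties warning), and its tiny p-value, taken at face value, would actually reject the Poisson fit.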

1.5.4 Years of education

# Normality check for years of education: histogram, then a normal Q-Q plot.
hist(zombies$years_of_education)

qqnorm(zombies$years_of_education)

# this is not normal
# i think its poisson
# i'm breaking out ks again
# One-sample KS test against a Poisson CDF whose rate is the population mean.
# NOTE(review): the null hypothesis is that the data DO follow the given
# distribution, so a small p-value is evidence against Poisson, not for it.
# The KS test also assumes a continuous distribution, which is why count data
# triggers the ties warning.
ks.test(zombies$years_of_education, "ppois", pop_yrsedu_mean)
## Warning in ks.test.default(zombies$years_of_education, "ppois",
## pop_yrsedu_mean): ties should not be present for the Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  zombies$years_of_education
## D = 0.23513, p-value < 2.2e-16
## alternative hypothesis: two-sided

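The histogram shape and the similar mean and variance (2.996 vs. 2.808) suggest this is Poisson distributed! Note, though, that the KS test above does not confirm it: the test assumes a continuous distribution (hence the ties warning), and its tiny p-value, taken at face value, would actually reject the Poisson fit.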

1.5.5 Age

# Normality check for age: histogram for shape, then a normal Q-Q plot.
hist(zombies$age)

qqnorm(zombies$age)

# this looks approximately normal

1.6 Question 5

set.seed(812) # fixed seed (the author's birthday) so the draws are reproducible
# Draw 30 values of each variable without replacement.
# `replace = FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned, whereas `FALSE` is a reserved word.
# NOTE(review): every sample() call draws independently, so the five vectors
# come from different sets of 30 rows -- confirm that is intended rather than
# one shared sample of 30 survivors.
zombie_height <- sample(zombies$height, 30, replace = FALSE)
zombie_weight <- sample(zombies$weight, 30, replace = FALSE)
zombie_age <- sample(zombies$age, 30, replace = FALSE)
zombie_years_of_education <- sample(zombies$years_of_education, 30, replace = FALSE)
zombie_zkills <- sample(zombies$zombies_killed, 30, replace = FALSE)

We’re going to construct t intervals for all these samples! I’ve personally never used a normal assumption except for binomial proportions (because the math just so happens to work out that way).

# Two-sided 95% t intervals for the mean of each sample: mean(x) +/- margin.
#
# NOTE(review): the `## [1]` values echoed below were knit when the t quantile
# used df = n instead of df = n - 1. Re-knit to refresh them; the corrected
# intervals are slightly wider.

# Half-width of a two-sided 95% t interval for the mean of `x`:
# t(0.975, n - 1) * sqrt(s^2 / n). The quantile uses n - 1 degrees of freedom
# because the variance is estimated from the same n observations.
t_margin <- function(x) {
  n <- length(x)
  qt(0.975, df = n - 1) * sqrt(var(x) / n)
}

# height
LB_height <- mean(zombie_height) - t_margin(zombie_height)
LB_height
## [1] 66.50284
UB_height <- mean(zombie_height) + t_margin(zombie_height)
UB_height
## [1] 69.54965
# weight
LB_weight <- mean(zombie_weight) - t_margin(zombie_weight)
LB_weight
## [1] 136.9727
UB_weight <- mean(zombie_weight) + t_margin(zombie_weight)
UB_weight
## [1] 150.3831
# zombies killed
LB_zkills <- mean(zombie_zkills) - t_margin(zombie_zkills)
LB_zkills
## [1] 2.793601
UB_zkills <- mean(zombie_zkills) + t_margin(zombie_zkills)
UB_zkills
## [1] 4.073066
# age
LB_age <- mean(zombie_age) - t_margin(zombie_age)
LB_age
## [1] 18.10907
UB_age <- mean(zombie_age) + t_margin(zombie_age)
UB_age
## [1] 20.9116
# years of education
LB_years_of_education <- mean(zombie_years_of_education) -
  t_margin(zombie_years_of_education)
LB_years_of_education
## [1] 2.101328
UB_years_of_education <- mean(zombie_years_of_education) +
  t_margin(zombie_years_of_education)
UB_years_of_education
## [1] 3.232005

1.7 Question 6

Create the data…

set.seed(812) # same fixed seed (the author's birthday) for reproducibility
# Build a sampling distribution for each variable: 100 means, each taken from
# a fresh sample of 30 values drawn without replacement. replicate() stands in
# for an explicit for loop. These assignments overwrite the single-sample
# vectors of the same names created for Question 5.
# `replace = FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned, whereas `FALSE` is a reserved word.
zombie_height <- replicate(100, mean(sample(zombies$height, 30, replace = FALSE)))
zombie_weight <- replicate(100, mean(sample(zombies$weight, 30, replace = FALSE)))
zombie_age <- replicate(100, mean(sample(zombies$age, 30, replace = FALSE)))
zombie_years_of_education <- replicate(100, mean(sample(zombies$years_of_education, 30, replace = FALSE)))
zombie_zkills <- replicate(100, mean(sample(zombies$zombies_killed, 30, replace = FALSE)))

…Now lets compare confidence intervals!

# Two-sided 95% t intervals computed from each vector of sample means:
# mean(x) +/- margin.
#
# NOTE(review): the `## [1]` values echoed below were knit when the t quantile
# used df = n instead of df = n - 1. Re-knit to refresh them; the corrected
# intervals are slightly wider.

# Half-width of a two-sided 95% t interval for the mean of `x`:
# t(0.975, n - 1) * sqrt(s^2 / n). The quantile uses n - 1 degrees of freedom
# because the variance is estimated from the same n values.
t_margin <- function(x) {
  n <- length(x)
  qt(0.975, df = n - 1) * sqrt(var(x) / n)
}

# height
LB_height <- mean(zombie_height) - t_margin(zombie_height)
LB_height
## [1] 67.40477
UB_height <- mean(zombie_height) + t_margin(zombie_height)
UB_height
## [1] 67.69399
# weight
LB_weight <- mean(zombie_weight) - t_margin(zombie_weight)
LB_weight
## [1] 143.5118
UB_weight <- mean(zombie_weight) + t_margin(zombie_weight)
UB_weight
## [1] 144.8491
# zombies killed
LB_zkills <- mean(zombie_zkills) - t_margin(zombie_zkills)
LB_zkills
## [1] 2.925841
UB_zkills <- mean(zombie_zkills) + t_margin(zombie_zkills)
UB_zkills
## [1] 3.048826
# age
LB_age <- mean(zombie_age) - t_margin(zombie_age)
LB_age
## [1] 19.93236
UB_age <- mean(zombie_age) + t_margin(zombie_age)
UB_age
## [1] 20.11583
# years of education
LB_years_of_education <- mean(zombie_years_of_education) -
  t_margin(zombie_years_of_education)
LB_years_of_education
## [1] 2.950394
UB_years_of_education <- mean(zombie_years_of_education) +
  t_margin(zombie_years_of_education)
UB_years_of_education
## [1] 3.052939



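The intervals are centered on roughly the same means as in Question 5, but they are much narrower, because the means in a sampling distribution vary far less than the individual observations do!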

Histograms?

# Histograms of the sample means for each variable, to judge the shape of
# each sampling distribution.
hist(zombie_height)

hist(zombie_weight)

hist(zombie_age)

hist(zombie_zkills)

hist(zombie_years_of_education)

# Normal Q-Q plots of the same vectors; points close to a straight line
# indicate an approximately normal sampling distribution.
qqnorm(zombie_height)

qqnorm(zombie_weight)

qqnorm(zombie_age)

qqnorm(zombie_zkills)

qqnorm(zombie_years_of_education)

The Poisson data are looking more normal, but to call them normal would definitely be a reach. Imagine replicating 1000 times!